PROJECT

1. INTRODUCTION

Our data can be downloaded from these links: train and test. This is an anonymous data, so we do not have any information about the features. Our aim is to create the best model for this classification data. Our accuracy metric will be mean of AUC and Balanced Error Rate (BER) scores, which can be calculated with *calculate_score_** functions.

calculate_score = function(true, pred){
  AUC = auc(true, pred)
  BER = sum(diag(table(true, pred > 0.5))) / length(true)
  OverAll = (AUC + BER ) / 2
  print(paste("AUC is: ", as.character(round(AUC, 4))))
  print(paste("BER is: ", as.character(round(BER, 4))))
  print(paste("OverAll is: ", as.character(round(OverAll, 4))))
  c(round(AUC, 4), round(BER, 4), round(OverAll, 4))
}

calculate_score_glm = function(model, data){
  columns = setdiff(colnames(data), "y")
  X = data.matrix(data[,..columns, with = FALSE])
  y = data.matrix(data[, y])
  pred <- predict(model, X, type = "response")
  calculate_score(y, pred)
}

calculate_score_rf = function(model, data){
  target = data[, y]
  pred <- predict(model, newdata = data, type = "prob")
  calculate_score(target, pred[, "b"])
}

calculate_score_sgb = function(model, data){
  target = data[, y]
  pred <- predict(model, newdata = data, type = "response")
  calculate_score(target, 1- pred)
}

2. IMPLEMENTATION

At the beginning, we need to import data and packages.

#Required packages
pti <- c("data.table", "tidyverse", "glmnet", "caret", "rpart", "randomForest", "gbm", "plyr", "ROSE", "pROC", "corrplot", "mlbench")
pti <- pti[!(pti %in% installed.packages())]
if(length(pti)>0){
    install.packages(pti)
}

library(data.table)
library(tidyverse)
library(glmnet)
library(caret)
library(rpart)
library(randomForest)
library(gbm)
library(plyr)
library(ROSE)
library(pROC)
library(corrplot)
library(mlbench)

scores = data.table()
train_data = fread("~/Desktop/582_proje/IE582_Fall20_ProjectTrain.csv")
test_data = fread("~/Desktop/582_proje/IE582_Fall20_ProjectTest.csv")
head(train_data)
##    x1 x2 x3 x4 x5 x6 x7 x8    x9   x10   x11 x12 x13 x14 x15 x16 x17 x18 x19
## 1: 27  1  1  1 18  3  1 28 119.9 154.0 121.4   1   0 404   1   0   0   0   0
## 2: 30  0  1  1 18 13  3 19  86.7 132.9 129.0   0   0 303   1   0   0   0   0
## 3: 37  0  1  1  1  3 14 33 174.0 128.1 100.2   0   0 454   1   0   0   0   0
## 4: 29  0  1  1 14  9  3 29   8.8 126.8  55.5   1   0 383   1   1   0   0   0
## 5: 33  1  1  0  2 15 12 39  55.0 187.6 156.6   1   0 404   0   0   0   0   0
## 6: 33  0  0  1  5  5 12 26 144.7 150.9  57.7   0   0 404   1   0   0   0   0
##    x20 x21 x22 x23 x24 x25 x26 x27 x28 x29 x30 x31 x32 x33 x34 x35 x36 x37 x38
## 1:   0   0   0   1   0   0   0 115   0   0 562   0 389   0   0   0   0   0   0
## 2:   0   0   1   1   0   0   0 129   0   0 624   0 266   0   0   0   0   0   0
## 3:   0   0   0   1   0   0   0 173   0   0 562   0 411   0   1   0   0   0   0
## 4:   0   0   0   0   1   0   0 281   0   0 624   0 611   0   0   0   0   0   0
## 5:   0   0   0   1   0   0   1 220   0   0 562   0 289   0   1   0   0   0   0
## 6:   0   0   0   0   0   1   0 173   0   0 812   0 255   0   0   0 512   0   0
##    x39 x40 x41 x42 x43 x44 x45 x46 x47 x48 x49 x50 x51 x52 x53 x54 x55 x56 x57
## 1:   0   0   1   0   0   1   1   0   0   0   0   0   0   0   0   0   0   1   0
## 2:   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0   0   0   1   0
## 3:   0   0   0   0   0   1   0   0   0   0   0   0   1   0   1   0   0   1   0
## 4:   1   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 5:   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0   0   0   1   0
## 6:   0   0   0   0   0   1   0   0   0   0   0   0   0   0   1   1   0   0   0
##    x58 x59 x60 y
## 1:   0   0   0 a
## 2:   0   0   0 a
## 3:   0   0   0 b
## 4:   0   0   0 a
## 5:   0   0   0 a
## 6:   1   0   0 b
str(train_data)
## Classes 'data.table' and 'data.frame':   2074 obs. of  61 variables:
##  $ x1 : int  27 30 37 29 33 33 29 27 28 27 ...
##  $ x2 : int  1 0 0 0 1 0 1 1 1 0 ...
##  $ x3 : int  1 1 1 1 1 0 0 1 1 1 ...
##  $ x4 : int  1 1 1 1 0 1 1 1 1 0 ...
##  $ x5 : int  18 18 1 14 2 5 16 13 0 8 ...
##  $ x6 : int  3 13 3 9 15 5 1 4 0 18 ...
##  $ x7 : int  1 3 14 3 12 12 2 17 2 18 ...
##  $ x8 : int  28 19 33 29 39 26 24 34 40 26 ...
##  $ x9 : num  119.9 86.7 174 8.8 55 ...
##  $ x10: num  154 133 128 127 188 ...
##  $ x11: num  121.4 129 100.2 55.5 156.6 ...
##  $ x12: int  1 0 0 1 1 0 0 0 0 1 ...
##  $ x13: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x14: int  404 303 454 383 404 404 404 30 202 404 ...
##  $ x15: int  1 1 1 1 0 1 0 1 1 1 ...
##  $ x16: int  0 0 0 1 0 0 0 0 1 0 ...
##  $ x17: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x18: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x19: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x20: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x21: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x22: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ x23: int  1 1 1 0 1 0 1 0 0 0 ...
##  $ x24: int  0 0 0 1 0 0 0 1 1 0 ...
##  $ x25: int  0 0 0 0 0 1 0 0 0 0 ...
##  $ x26: int  0 0 0 0 1 0 0 0 0 0 ...
##  $ x27: int  115 129 173 281 220 173 103 102 99 112 ...
##  $ x28: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ x29: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x30: int  562 624 562 624 562 812 812 375 187 624 ...
##  $ x31: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x32: int  389 266 411 611 289 255 466 200 910 244 ...
##  $ x33: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x34: int  0 0 1 0 1 0 0 0 0 0 ...
##  $ x35: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x36: int  0 0 0 0 0 512 0 0 0 0 ...
##  $ x37: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x38: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x39: int  0 0 0 1 0 0 0 0 0 0 ...
##  $ x40: int  0 0 0 0 0 0 0 0 0 1 ...
##  $ x41: int  1 1 0 1 1 0 1 1 1 1 ...
##  $ x42: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x43: int  0 0 0 0 0 0 0 0 1 0 ...
##  $ x44: int  1 1 1 0 1 1 1 0 1 1 ...
##  $ x45: int  1 0 0 0 0 0 0 0 0 0 ...
##  $ x46: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x47: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ x48: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x49: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x50: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x51: int  0 0 1 0 0 0 0 0 0 0 ...
##  $ x52: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x53: int  0 0 1 0 0 1 0 0 0 0 ...
##  $ x54: int  0 0 0 0 0 1 0 1 0 1 ...
##  $ x55: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x56: int  1 1 1 0 1 0 1 0 0 0 ...
##  $ x57: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x58: int  0 0 0 0 0 1 0 1 0 1 ...
##  $ x59: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ x60: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ y  : chr  "a" "a" "b" "a" ...
##  - attr(*, ".internal.selfref")=<externalptr>

As we can see, we have only numeric features. Before moving on to the model creation step, we need to check some information about the data. We can check whether this data has NA values.

anyNA(train_data)
## [1] FALSE
anyNA(test_data)
## [1] FALSE

As we can see, there is no NA values in the data. So, we don’t need any missing value imputation processes. We need to check whether there are duplicated rows.

sum(duplicated(train_data))
## [1] 0
sum(duplicated(test_data))
## [1] 0

So, we do not have any duplicated rows. Now, we are ready to get some insights about the data. We can start from the target variable.

table(train_data$y)
## 
##    a    b 
## 1565  509
prop.table(table(train_data$y))
## 
##         a         b 
## 0.7545805 0.2454195
ggplot(train_data, aes(x = y)) + geom_bar()

It means that this data is an imbalanced classification data. We have 75% of data with one class and the rest is the other class. There is one big problem about this we can not understand the pattern in the data for minority class, which refers to the class that has lower instances rather than the other class. So, it can drive the model to predict all instance for the majority class, which can end up with a high accuracy score. So, in this type of problems, to predict minority class correctly is more important then to predict the majority class correctly.

To handle this type of data, there are some methods like listed below:

  1. Oversampling: We can replicate instances from the minority class, so that we can have similar number of instances in both classes. The advantage of this method is that we do not lose any information about the data. But, as a disadvantage, we may not very accurate predictions on unseen data because of this replication process.

  2. Undersampling: We are selecting some instances from the majority class, so that we can have similar number of instances in both classes. The advantage of this method is that there is no replication process and can create models faster if we have huge data. The disadvantage of this method is that we lose some information in the data.

  3. Oversampling and Undersampling: This method is the combination of both oversampling and undersampling methods at the same time. So, this method have advantage and disadvantage of both methods.

  4. Synthetic Data Generation: This process is about creating new instances from the minority class. Two instance from the minority class is taken and creating a new instance between these two instances. So, we generate a new instances looks like these instances. The advantage of this method is that we are generating data without losing information or replication. The disadvantage is that we may create some instances that could be from the majority class. It is a very common technic to use for imbalanced data.

  5. Cost Sensitive Learning: Normally, we give the same cost for predicting an instance with true or false. Because of that it can be a good model to predict all instances with the majority class. To overcome this issue, we can give costs for the errors. So, our model try to distinguish instances from each other. The advantage is that we can force the model to classify the data rather than predicting all instance from the majority class. The disadvantage is that we may not find the best cost matrix for the model.

Before moving on the data generation steps, we need to check the features variance, correlation and split the data into the train and test set.

colnames(train_data %>% 
  select_if(~n_distinct(.) <= 1))
## [1] "x50" "x52"
train_data <- train_data %>% 
  select_if(~n_distinct(.) > 1)

We moved x50 and x52 columns from the data. They have the same values for all instances. We can check the correlation between features.

M <- cor(train_data %>% select_if(is.numeric))

corrplot(M , method = "number")

We assume that two features are correlated if their correlation is higher than 0.6 or lower than -0.6. So, with this assumption, we can say that these variables are correlated:

-x23 and x56 -x23 and x54 -x28 and x15

In any of the model, we should remove one of the correlated columns.To check the values in all columns, we can create boxplots for all columns.

ggplot(stack(train_data[, c("x1", "x2", "x3", "x4", "x5", "x6")]), aes(x = ind, y = values)) +
  geom_boxplot()

ggplot(stack(train_data[, c("x7", "x8", "x9", "x10", "x11", "x12")]), aes(x = ind, y = values)) +
  geom_boxplot()

ggplot(stack(train_data[, c("x13", "x14", "x15", "x16", "x17", "x18")]), aes(x = ind, y = values)) +
  geom_boxplot()

ggplot(stack(train_data[, c("x19", "x20", "x21", "x22", "x23", "x24")]), aes(x = ind, y = values)) +
  geom_boxplot()

ggplot(stack(train_data[, c("x25", "x26", "x27", "x28", "x29", "x30")]), aes(x = ind, y = values)) +
  geom_boxplot()

ggplot(stack(train_data[, c("x31", "x32", "x33", "x34", "x35", "x36")]), aes(x = ind, y = values)) +
  geom_boxplot()

ggplot(stack(train_data[, c("x37", "x38", "x39", "x40", "x41", "x42")]), aes(x = ind, y = values)) +
  geom_boxplot()

ggplot(stack(train_data[, c("x43", "x44", "x45", "x46", "x47", "x48")]), aes(x = ind, y = values)) +
  geom_boxplot()

ggplot(stack(train_data[, c("x49", "x51", "x53", "x54", "x55", "x56")]), aes(x = ind, y = values)) +
  geom_boxplot()

ggplot(stack(train_data[, c("x57", "x58", "x59", "x60", "y")]), aes(x = ind, y = values)) +
  geom_boxplot()

So we see that we need to turn some columns in to factor, because they have only 0 or 1 values. Also, we can transform the target variable from character to factor.

train_data = train_data %>%
  mutate_at(c("x2" , "x3",  "x4",  "x12" ,"x13",  "x15", "x16", "x17", "x18" ,"x19" ,"x20", "x21", "x22" ,"x23", "x24", "x25" ,"x26", "x28", "x29", "x31", "x33", "x34" ,"x35" , "x37", "x38" ,"x39" ,"x40", "x41",  "x43", "x44", "x45", "x46", "x47" ,"x48", "x49" ,"x51", "x53", "x54", "x55", "x56" ,"x57" ,"x58", "x59", "x60", "y"), factor)

We can split the data. We use the createDataPartition method from the caret package.

set.seed(12345)
split <- createDataPartition(train_data$y, p = .7, 
                             list = FALSE, 
                             times = 1)

train <- train_data[split,]
test  <- train_data[-split,]

For the cost matrix, we will use the formulation below:

model_weights <- ifelse(train$y == "a",
                        (1/table(train$y)[1]) * 0.5,
                        (1/table(train$y)[2]) * 0.5)

Before moving on with sampling methods, we can do some feature engineering processes. In this data, we do not have any information about the columns. So, it would be misleading to create new features, because of having a chance to choose two irrelevant columns.

We can do the sampling methods:

  1. Oversampling
train_over <- ovun.sample(y ~ ., data = train, method = "over",N = 2192)$data
table(train_over$y)
## 
##    a    b 
## 1096 1096
  1. Undersampling
train_under <- ovun.sample(y ~ ., data = train, method = "under", N = 714, seed = 3295)$data
table(train_under$y)
## 
##   a   b 
## 357 357
  1. Oversampling and Undersampling
train_both <- ovun.sample(y ~ ., data = train, method = "both", p=0.5, N=1000, seed = 1)$data
table(train_both$y)
## 
##   a   b 
## 520 480
  1. Synthetic Data Generation:
train_rose <- ROSE(y ~ ., data = train, seed = 1)$data
table(train_rose$y)
## 
##   a   b 
## 764 689

We can do these sampling processes with the caret train function. In the trainControl function, we can define sampling method as down, up, smote or rose. We will also use the results of the train sampling.

Now, we can search for important features in all data. For this purpose, we implemented random forest models for all data. Random forest give us some information about the feature importance. To create the best random forest model, we will do a tuning process. The most important parameter in a random forest model is the number of variables randomly sampled as candidates at each split. We will try to find best value for that parameter. There are also two important parameters, which are number of trees to grow and minimum size of terminal nodes. We will set 500 and 5, respectively. We can use the rf_tuning function for this purpose.

rf_tuning = function(data){
  set.seed(12345) 
  ctrl <- trainControl(method = "cv",
                     number = 10)
  
  model = train(y ~ .,
                data = data,
                trControl = ctrl, 
                tuneGrid =  expand.grid(mtry = seq(floor(ncol(data) / 4), floor(ncol(data) / 2), length.out = 10)),
                ntree = 100,
                nodesize = 5,
                importance = TRUE
  ) 
  model
}

Now we will create these random forest models and choose the important columns from the plot that we can create with VarImpPlot function.

rf_train = rf_tuning(train)
varImpPlot(rf_train$finalModel)

train_varimp = train[,c("x42", "x30", "x23", "x14", "x32", "x36", "x48", "x20", "x44", "x24", "x18", "x13", "x25", "x33", "x60", "x3", "x21", "x51", "x12", "x41", "x47", "x34", "x31", "x58", "x53", "x40", "x39", "x16", "x38", "x10", "x9", "x27", "x11", "x8", "x1", "x7", "x6", "x5", "y")]
rf_train_over = rf_tuning(train_over)
varImpPlot(rf_train_over$finalModel)

train_over_varimp = train_over[, c("x30", "x32", "x14", "x42", "x11", "x9", "x27", "x8", "x1", "x10", "x5", "x7", "x6", "x23", "x53", "x40", "x24", "x34", "x36", "x20", "x48", "x47", "x41", "x2", "x3", "x12", "x51", "x4", "x58", "x17", "x39", "x44", "x38", "y")]
rf_train_under = rf_tuning(train_under)
varImpPlot(rf_train_under$finalModel)

train_under_varimp = train_under[, c("x30", "x32", "x14", "x42", "x11", "x9", "x27", "x8", "x1", "x10", "x5", "x7", "x6", "x23", "x53", "x40", "x24", "x34", "x36", "x20", "x48", "x41", "x3", "x12", "x58", "x17", "x39", "x44", "x38", "x16", "x33", "x35", "x45", "x28", "x18", "x22", "x29", "y")]
rf_train_both = rf_tuning(train_both)
varImpPlot(rf_train_both$finalModel)

train_both_varimp = train_both[, c("x30", "x32", "x14", "x42", "x11", "x9", "x27", "x8", "x1", "x10", "x5", "x7", "x6", "x23", "x53", "x40", "x24", "x34", "x36", "x20", "x48", "x41", "x3",  "x58", "x17", "x39", "x38", "x16", "x33", "x4", "x2", "x47", "y")]
rf_train_rose = rf_tuning(train_rose)
varImpPlot(rf_train_rose$finalModel)

train_rose_varimp = train_rose[, c("x30", "x32", "x42", "x9", "x23", "x53", "x24", "x36", "x20", "x48", "x58", "x17", "x39", "x38", "x16", "x33", "x57", "x59", "x49", "x60", "x29", "x26", "x13", "x21", "x55", "x14", "x22", "x46", "x25", "x43", "x44", "y")]

Another approach is to choose the best features with Recursive Feature Elimination method. In the caret package, there is a method called rfe to implement this method.

control_rfe <- rfeControl(functions=rfFuncs, method="cv", number=10)
results <- rfe(train[, 1:57], train$y, rfeControl=control_rfe)
columns = c(predictors(results), "y")
train_rfe = train[,..columns, with = FALSE]
results <- rfe(train_over[, 1:57], train_over$y, rfeControl=control_rfe)
columns = c(predictors(results), "y")
train_over_rfe = train_over[,columns]
results <- rfe(train_under[, 1:57], train_under$y, rfeControl=control_rfe)
columns = c(predictors(results), "y")
train_under_rfe = train_under[,columns]
results <- rfe(train_both[, 1:57], train_both$y, rfeControl=control_rfe)
columns = c(predictors(results), "y")
train_both_rfe = train_both[,columns]
results <- rfe(train_rose[, 1:57], train_rose$y, rfeControl=control_rfe)
columns = c(predictors(results), "y")
train_rose_rfe = train_rose[,columns]

Also, as the last feature selection option, we can remove the columns that have less than 0.01 variance.

apply(train_data[, -c("y")] %>% mutate_all(as.numeric), 2, sd)[order(apply(train_data[, -c("y")] %>% mutate_all(as.numeric), 2, sd))]
##          x37          x57          x59          x46          x26          x49 
##   0.02195814   0.03104601   0.07894173   0.08475587   0.09018568   0.10013827 
##          x55          x18          x43          x21          x60          x13 
##   0.13063192   0.14089075   0.16212522   0.16628746   0.17427802   0.17938152 
##          x33          x31          x29          x19          x20          x22 
##   0.17938152   0.18063165   0.18187191   0.20594048   0.20911283   0.21222790 
##          x34          x45          x35          x53          x28          x24 
##   0.23715054   0.23804381   0.24843974   0.28088406   0.30173700   0.30489823 
##          x51          x16          x25          x40          x39          x47 
##   0.30677023   0.31645562   0.32454297   0.32956418   0.33011385   0.33283813 
##          x38          x58          x48          x15          x17          x41 
##   0.34888286   0.34937752   0.35085332   0.35758247   0.42637294   0.44050817 
##          x44           x4          x54           x2           x3          x12 
##   0.46161885   0.46241828   0.46664817   0.46990252   0.47314064   0.47476501 
##          x56          x23           x1           x7           x5           x8 
##   0.49507025   0.49985170   4.70147513   5.50136470   5.54478916   5.59920800 
##           x6          x11          x10           x9          x27          x42 
##   5.60845819  56.92567741  57.65754285  58.26309010  70.09150803  73.34424608 
##          x36          x14          x32          x30 
##  93.36944571 118.00349681 147.13864375 158.64833599

It can be seen that we need to remove x37, x57, x59, x46 and x26 columns.

train2 = train[,-c("x37","x57","x59","x46","x26")]

Now, we are ready to implement models these data. From the beginning, we can implement Lasso and Ridge regressions. These are penalized regression models and also linear models. For these models, lambda is an important parameter. We need to find the best value for this model with the help of penalized_tuning function. After tuning the lambda value, we will use lambda.1se value to create the generalized linear model.

penalized_tuning = function(data, alpha = 0){
  X = data.matrix(data[, 1:(ncol(data)-1)])
  y = data.matrix(data$y)

  glm_tune = cv.glmnet(X, y, alpha = alpha, family = "binomial", standardize = TRUE)
  lambda_1se = glm_tune$lambda.1se
  
  model = glmnet(X,
                 y,
                 lambda = lambda_1se,
                 family = "binomial",
                 standardize = TRUE,
                 alpha = alpha)
  model
}

We can create Lasso and Ridge regression models for all data.

lasso_train = penalized_tuning(train, alpha = 1)
ridge_train = penalized_tuning(train, alpha = 0)
lasso_train_over = penalized_tuning(train_over, alpha = 1)
ridge_train_over = penalized_tuning(train_over, alpha = 0)
lasso_train_under = penalized_tuning(train_under, alpha = 1)
ridge_train_under = penalized_tuning(train_under, alpha = 0)
lasso_train_both = penalized_tuning(train_both, alpha = 1)
ridge_train_both = penalized_tuning(train_both, alpha = 0)
lasso_train_rose = penalized_tuning(train_rose, alpha = 1)
ridge_train_rose = penalized_tuning(train_rose, alpha = 0)
lasso_train_rfe = penalized_tuning(train_rfe, alpha = 1)
ridge_train_rfe = penalized_tuning(train_rfe, alpha = 0)
lasso_train_over_rfe = penalized_tuning(train_over_rfe, alpha = 1)
ridge_train_over_rfe = penalized_tuning(train_over_rfe, alpha = 0)
lasso_train_under_rfe = penalized_tuning(train_under_rfe, alpha = 1)
ridge_train_under_rfe = penalized_tuning(train_under_rfe, alpha = 0)
lasso_train_both_rfe = penalized_tuning(train_both_rfe, alpha = 1)
ridge_train_both_rfe = penalized_tuning(train_both_rfe, alpha = 0)
lasso_train_rose_rfe = penalized_tuning(train_rose_rfe, alpha = 1)
ridge_train_rose_rfe = penalized_tuning(train_rose_rfe, alpha = 0)
lasso_train2 = penalized_tuning(train2, alpha = 1)
ridge_train2 = penalized_tuning(train2, alpha = 0)

After creating these models, we need to calculate score for each of them on the test set.

columns = colnames(train)
result = calculate_score_glm(lasso_train, test[, ..columns])
## [1] "AUC is:  0.8458"
## [1] "BER is:  0.8132"
## [1] "OverAll is:  0.8295"
scores = rbind(scores, data.table(data = "train", model = "lasso", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train)
result = calculate_score_glm(ridge_train, test[, ..columns])
## [1] "AUC is:  0.8582"
## [1] "BER is:  0.8213"
## [1] "OverAll is:  0.8397"
scores = rbind(scores, data.table(data = "train", model = "ridge", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_over)
result = calculate_score_glm(lasso_train_over, test[, ..columns])
## [1] "AUC is:  0.8517"
## [1] "BER is:  0.7601"
## [1] "OverAll is:  0.8059"
scores = rbind(scores, data.table(data = "train_over", model = "lasso", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_over)
result = calculate_score_glm(ridge_train_over, test[, ..columns])
## [1] "AUC is:  0.8477"
## [1] "BER is:  0.7424"
## [1] "OverAll is:  0.795"
scores = rbind(scores, data.table(data = "train_over", model = "ridge", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_under)
result = calculate_score_glm(lasso_train_under, test[, ..columns])
## [1] "AUC is:  0.8375"
## [1] "BER is:  0.7327"
## [1] "OverAll is:  0.7851"
scores = rbind(scores, data.table(data = "train_under", model = "lasso", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_under)
result = calculate_score_glm(ridge_train_under, test[, ..columns])
## [1] "AUC is:  0.8542"
## [1] "BER is:  0.752"
## [1] "OverAll is:  0.8031"
scores = rbind(scores, data.table(data = "train_under", model = "ridge", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_both)
result = calculate_score_glm(lasso_train_both, test[, ..columns])
## [1] "AUC is:  0.8493"
## [1] "BER is:  0.7601"
## [1] "OverAll is:  0.8047"
scores = rbind(scores, data.table(data = "train_both", model = "lasso", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_both)
result = calculate_score_glm(ridge_train_both, test[, ..columns])
## [1] "AUC is:  0.8515"
## [1] "BER is:  0.752"
## [1] "OverAll is:  0.8017"
scores = rbind(scores, data.table(data = "train_both", model = "ridge", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_rose)
result = calculate_score_glm(lasso_train_rose, test[, ..columns])
## [1] "AUC is:  0.8458"
## [1] "BER is:  0.7456"
## [1] "OverAll is:  0.7957"
scores = rbind(scores, data.table(data = "train_rose", model = "lasso", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_rose)
result = calculate_score_glm(ridge_train_rose, test[, ..columns])
## [1] "AUC is:  0.8445"
## [1] "BER is:  0.7327"
## [1] "OverAll is:  0.7886"
scores = rbind(scores, data.table(data = "train_rose", model = "ridge", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_rfe)
result = calculate_score_glm(lasso_train_rfe, test[, ..columns])
## [1] "AUC is:  0.8436"
## [1] "BER is:  0.8084"
## [1] "OverAll is:  0.826"
scores = rbind(scores, data.table(data = "train_rfe", model = "lasso", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_rfe)
result = calculate_score_glm(ridge_train_rfe, test[, ..columns])
## [1] "AUC is:  0.8389"
## [1] "BER is:  0.8084"
## [1] "OverAll is:  0.8236"
scores = rbind(scores, data.table(data = "train_rfe", model = "ridge", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_over_rfe)
result = calculate_score_glm(lasso_train_over_rfe, test[, ..columns])
## [1] "AUC is:  0.8454"
## [1] "BER is:  0.7568"
## [1] "OverAll is:  0.8011"
scores = rbind(scores, data.table(data = "train_over_rfe", model = "lasso", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_over_rfe)
result = calculate_score_glm(ridge_train_over_rfe, test[, ..columns])
## [1] "AUC is:  0.8342"
## [1] "BER is:  0.744"
## [1] "OverAll is:  0.7891"
scores = rbind(scores, data.table(data = "train_over_rfe", model = "ridge", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_under_rfe)
result = calculate_score_glm(lasso_train_under_rfe, test[, ..columns])
## [1] "AUC is:  0.8351"
## [1] "BER is:  0.7262"
## [1] "OverAll is:  0.7807"
scores = rbind(scores, data.table(data = "train_under_rfe", model = "lasso", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_under_rfe)
result = calculate_score_glm(ridge_train_under_rfe, test[, ..columns])
## [1] "AUC is:  0.8541"
## [1] "BER is:  0.7536"
## [1] "OverAll is:  0.8038"
scores = rbind(scores, data.table(data = "train_under_rfe", model = "ridge", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_both_rfe)
result = calculate_score_glm(lasso_train_both_rfe, test[, ..columns])
## [1] "AUC is:  0.8324"
## [1] "BER is:  0.7343"
## [1] "OverAll is:  0.7833"
scores = rbind(scores, data.table(data = "train_both_rfe", model = "lasso", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_both_rfe)
result = calculate_score_glm(ridge_train_both_rfe, test[, ..columns])
## [1] "AUC is:  0.8341"
## [1] "BER is:  0.7585"
## [1] "OverAll is:  0.7963"
scores = rbind(scores, data.table(data = "train_both_rfe", model = "ridge", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_rose_rfe)
result = calculate_score_glm(lasso_train_rose_rfe, test[, ..columns])
## [1] "AUC is:  0.8424"
## [1] "BER is:  0.7375"
## [1] "OverAll is:  0.79"
scores = rbind(scores, data.table(data = "train_rose_rfe", model = "lasso", AUC = result[1], BER = result[2], OverAll = result[3]))
columns = colnames(train_rose_rfe)
result = calculate_score_glm(ridge_train_rose_rfe, test[, ..columns])
## [1] "AUC is:  0.8417"
## [1] "BER is:  0.7279"
## [1] "OverAll is:  0.7848"
scores = rbind(scores, data.table(data = "train_rose_rfe", model = "ridge", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl <- trainControl(method = "cv",
                     number = 10,
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE)

glm_weighted <- train(y ~ .,
                      data = train,
                      method = "glm",
                      weights = model_weights,
                      trControl = ctrl)

pred_weighted_glm <- predict(glm_weighted, newdata = test, type = "prob")
result = calculate_score(test$y, pred_weighted_glm[, "b"])
## [1] "AUC is:  0.8556"
## [1] "BER is:  0.7585"
## [1] "OverAll is:  0.807"
scores = rbind(scores, data.table(data = "train", model = "glm_weighted", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "up"

glm_sampling_up <- train(y ~ .,
                         data = train,
                         method = "glm",
                         trControl = ctrl)

pred_glm_sampling_up <- predict(glm_sampling_up, newdata = test, type = "prob")
result = calculate_score(test$y, pred_glm_sampling_up[, "b"])
## [1] "AUC is:  0.8564"
## [1] "BER is:  0.7552"
## [1] "OverAll is:  0.8058"
scores = rbind(scores, data.table(data = "train", model = "glm_up", AUC = result[1], BER = result[2], OverAll = result[3]))

glm_sampling_up2 <- train(y ~ .,
                         data = train2,
                         method = "glm",
                         trControl = ctrl)

pred_glm_sampling_up2 <- predict(glm_sampling_up2, newdata = test, type = "prob")
result = calculate_score(test$y, pred_glm_sampling_up2[, "b"])
## [1] "AUC is:  0.8553"
## [1] "BER is:  0.7601"
## [1] "OverAll is:  0.8077"
scores = rbind(scores, data.table(data = "train2", model = "glm_up", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "down"

glm_sampling_down <- train(y ~ .,
                           data = train,
                           method = "glm",
                           trControl = ctrl)

pred_glm_sampling_down <- predict(glm_sampling_down, newdata = test, type = "prob")
result = calculate_score(test$y, pred_glm_sampling_down[, "b"])
## [1] "AUC is:  0.8505"
## [1] "BER is:  0.7456"
## [1] "OverAll is:  0.798"
scores = rbind(scores, data.table(data = "train", model = "glm_down", AUC = result[1], BER = result[2], OverAll = result[3]))

glm_sampling_down2 <- train(y ~ .,
                           data = train2,
                           method = "glm",
                           trControl = ctrl)

pred_glm_sampling_down2 <- predict(glm_sampling_down2, newdata = test, type = "prob")
result = calculate_score(test$y, pred_glm_sampling_down2[, "b"])
## [1] "AUC is:  0.8594"
## [1] "BER is:  0.7504"
## [1] "OverAll is:  0.8049"
scores = rbind(scores, data.table(data = "train2", model = "glm_down", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "smote"

glm_sampling_smote <- train(y ~ .,
                            data = train,
                            method = "glm",
                            trControl = ctrl)

pred_glm_sampling_smote <- predict(glm_sampling_smote, newdata = test, type = "prob")
result = calculate_score(test$y, pred_glm_sampling_smote[, "b"])
## [1] "AUC is:  0.8521"
## [1] "BER is:  0.7923"
## [1] "OverAll is:  0.8222"
scores = rbind(scores, data.table(data = "train", model = "glm_smote", AUC = result[1], BER = result[2], OverAll = result[3]))

glm_sampling_smote2 <- train(y ~ .,
                            data = train2,
                            method = "glm",
                            trControl = ctrl)

pred_glm_sampling_smote2 <- predict(glm_sampling_smote2, newdata = test, type = "prob")
result = calculate_score(test$y, pred_glm_sampling_smote2[, "b"])
## [1] "AUC is:  0.8535"
## [1] "BER is:  0.781"
## [1] "OverAll is:  0.8173"
scores = rbind(scores, data.table(data = "train2", model = "glm_smote", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "rose"

glm_sampling_rose <- train(y ~ .,
                           data = train,
                           method = "glm",
                           trControl = ctrl)

pred_glm_sampling_rose <- predict(glm_sampling_rose, newdata = test, type = "prob")
result = calculate_score(test$y, pred_glm_sampling_rose[, "b"])
## [1] "AUC is:  0.8503"
## [1] "BER is:  0.7407"
## [1] "OverAll is:  0.7955"
scores = rbind(scores, data.table(data = "train", model = "glm_rose", AUC = result[1], BER = result[2], OverAll = result[3]))

glm_sampling_rose2 <- train(y ~ .,
                           data = train2,
                           method = "glm",
                           trControl = ctrl)

pred_glm_sampling_rose2 <- predict(glm_sampling_rose2, newdata = test, type = "prob")
result = calculate_score(test$y, pred_glm_sampling_rose2[, "b"])
## [1] "AUC is:  0.8477"
## [1] "BER is:  0.7552"
## [1] "OverAll is:  0.8015"
scores = rbind(scores, data.table(data = "train2", model = "glm_rose", AUC = result[1], BER = result[2], OverAll = result[3]))

These are the scores for Lasso and Ridge Regressions. We can try for nonlinear models like Model Based Decision Tree, Random Forest or Stochastic Gradient Boosting (SGB).

For Random Forest, we can tune parameters with the rf_tuning function explained before.

rf_train2 = rf_tuning(train2)
rf_train_varimp = rf_tuning(train_varimp)
rf_train_over_varimp = rf_tuning(train_over_varimp)
rf_train_under_varimp = rf_tuning(train_under_varimp)
rf_train_both_varimp = rf_tuning(train_both_varimp)
rf_train_rose_varimp = rf_tuning(train_rose_varimp)
rf_train_rfe = rf_tuning(train_rfe)
rf_train_over_rfe = rf_tuning(train_over_rfe)
rf_train_under_rfe = rf_tuning(train_under_rfe)
rf_train_both_rfe = rf_tuning(train_both_rfe)
rf_train_rose_rfe = rf_tuning(train_rose_rfe)

After creating these models, we need to calculate score for each of them on the test set.

result = calculate_score_rf(rf_train, test)
## [1] "AUC is:  0.8528"
## [1] "BER is:  0.8341"
## [1] "OverAll is:  0.8435"
scores = rbind(scores, data.table(data = "train", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train2, test)
## [1] "AUC is:  0.8407"
## [1] "BER is:  0.8213"
## [1] "OverAll is:  0.831"
scores = rbind(scores, data.table(data = "train2", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_over, test)
## [1] "AUC is:  0.8407"
## [1] "BER is:  0.8084"
## [1] "OverAll is:  0.8245"
scores = rbind(scores, data.table(data = "train_over", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_under, test)
## [1] "AUC is:  0.8491"
## [1] "BER is:  0.7407"
## [1] "OverAll is:  0.7949"
scores = rbind(scores, data.table(data = "train_under", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_both, test)
## [1] "AUC is:  0.841"
## [1] "BER is:  0.7939"
## [1] "OverAll is:  0.8174"
scores = rbind(scores, data.table(data = "train_both", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_rose, test)
## [1] "AUC is:  0.835"
## [1] "BER is:  0.7971"
## [1] "OverAll is:  0.816"
scores = rbind(scores, data.table(data = "train_rose", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_varimp, test)
## [1] "AUC is:  0.8458"
## [1] "BER is:  0.8132"
## [1] "OverAll is:  0.8295"
scores = rbind(scores, data.table(data = "train_varimp", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_over_varimp, test)
## [1] "AUC is:  0.8391"
## [1] "BER is:  0.8035"
## [1] "OverAll is:  0.8213"
scores = rbind(scores, data.table(data = "train_over_varimp", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_under_varimp, test)
## [1] "AUC is:  0.8348"
## [1] "BER is:  0.7327"
## [1] "OverAll is:  0.7837"
scores = rbind(scores, data.table(data = "train_under_varimp", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_both_varimp, test)
## [1] "AUC is:  0.8334"
## [1] "BER is:  0.7858"
## [1] "OverAll is:  0.8096"
scores = rbind(scores, data.table(data = "train_both_varimp", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_rose_varimp, test)
## [1] "AUC is:  0.8276"
## [1] "BER is:  0.7987"
## [1] "OverAll is:  0.8131"
scores = rbind(scores, data.table(data = "train_rose_varimp", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_rfe, test)
## [1] "AUC is:  0.8412"
## [1] "BER is:  0.8325"
## [1] "OverAll is:  0.8369"
scores = rbind(scores, data.table(data = "train_rfe", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_over_rfe, test)
## [1] "AUC is:  0.8395"
## [1] "BER is:  0.8213"
## [1] "OverAll is:  0.8304"
scores = rbind(scores, data.table(data = "train_over_rfe", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_under_rfe, test)
## [1] "AUC is:  0.8426"
## [1] "BER is:  0.7359"
## [1] "OverAll is:  0.7892"
scores = rbind(scores, data.table(data = "train_under_rfe", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_both_rfe, test)
## [1] "AUC is:  0.8248"
## [1] "BER is:  0.7826"
## [1] "OverAll is:  0.8037"
scores = rbind(scores, data.table(data = "train_both_rfe", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_rf(rf_train_rose_rfe, test)
## [1] "AUC is:  0.8227"
## [1] "BER is:  0.8019"
## [1] "OverAll is:  0.8123"
scores = rbind(scores, data.table(data = "train_rose_rfe", model = "rf", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl <- trainControl(method = "cv",
                     number = 10,
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE)

rf_weighted <- train(y ~ .,
                     data = train,
                     method = "rf",
                     weights = model_weights,
                     tuneGrid =  expand.grid(mtry = seq(floor(ncol(train) / 4), floor(ncol(train) / 2), length.out = 20)),
                     ntree = 100,
                     nodesize = 5,
                     metric = "ROC",
                     verbose = FALSE,
                     trControl = ctrl)

pred_weighted_rf <- predict(rf_weighted, newdata = test, type = "prob")
result = calculate_score(test$y, pred_weighted_rf[, "b"])
## [1] "AUC is:  0.8486"
## [1] "BER is:  0.8261"
## [1] "OverAll is:  0.8373"
scores = rbind(scores, data.table(data = "train", model = "rf_weighted", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "down"

rf_sampling_down <- train(y ~ .,
                           data = train,
                           method = "rf",
                           tuneGrid =  expand.grid(mtry = seq(floor(ncol(train) / 4), floor(ncol(train) / 2), length.out = 20)),
                           ntree = 100,
                           nodesize = 5,
                           metric = "ROC",
                           verbose = FALSE,
                           trControl = ctrl)

pred_rf_sampling_down <- predict(rf_sampling_down, newdata = test, type = "prob")
result = calculate_score(test$y, pred_rf_sampling_down[, "b"])
## [1] "AUC is:  0.8425"
## [1] "BER is:  0.7375"
## [1] "OverAll is:  0.79"
scores = rbind(scores, data.table(data = "train", model = "rf_down", AUC = result[1], BER = result[2], OverAll = result[3]))

rf_sampling_down2 <- train(y ~ .,
                           data = train2,
                           method = "rf",
                           tuneGrid =  expand.grid(mtry = seq(floor(ncol(train) / 4), floor(ncol(train) / 2), length.out = 20)),
                           ntree = 100,
                           nodesize = 5,
                           metric = "ROC",
                           verbose = FALSE,
                           trControl = ctrl)

pred_rf_sampling_down2 <- predict(rf_sampling_down2, newdata = test, type = "prob")
result = calculate_score(test$y, pred_rf_sampling_down2[, "b"])
## [1] "AUC is:  0.8482"
## [1] "BER is:  0.7504"
## [1] "OverAll is:  0.7993"
scores = rbind(scores, data.table(data = "train2", model = "rf_down", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "up"

rf_sampling_up <- train(y ~ .,
                        data = train,
                        method = "rf",
                        tuneGrid =  expand.grid(mtry = seq(floor(ncol(train) / 4), floor(ncol(train) / 2), length.out = 20)),
                        ntree = 100,
                        nodesize = 5,
                        metric = "ROC",
                        verbose = FALSE,
                        trControl = ctrl)

pred_rf_sampling_up <- predict(rf_sampling_up, newdata = test, type = "prob")
result = calculate_score(test$y, pred_rf_sampling_up[, "b"])
## [1] "AUC is:  0.841"
## [1] "BER is:  0.8068"
## [1] "OverAll is:  0.8239"
scores = rbind(scores, data.table(data = "train", model = "rf_up", AUC = result[1], BER = result[2], OverAll = result[3]))

rf_sampling_up2 <- train(y ~ .,
                        data = train2,
                        method = "rf",
                        tuneGrid =  expand.grid(mtry = seq(floor(ncol(train) / 4), floor(ncol(train) / 2), length.out = 20)),
                        ntree = 100,
                        nodesize = 5,
                        metric = "ROC",
                        verbose = FALSE,
                        trControl = ctrl)

pred_rf_sampling_up2 <- predict(rf_sampling_up2, newdata = test, type = "prob")
result = calculate_score(test$y, pred_rf_sampling_up2[, "b"])
## [1] "AUC is:  0.8396"
## [1] "BER is:  0.8116"
## [1] "OverAll is:  0.8256"
scores = rbind(scores, data.table(data = "train2", model = "rf_up", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "rose"

rf_sampling_rose <- train(y ~ .,
                        data = train,
                        method = "rf",
                        tuneGrid =  expand.grid(mtry = seq(floor(ncol(train) / 4), floor(ncol(train) / 2), length.out = 20)),
                        ntree = 100,
                        nodesize = 5,
                        metric = "ROC",
                        verbose = FALSE,
                        trControl = ctrl)

pred_rf_sampling_rose <- predict(rf_sampling_rose, newdata = test, type = "prob")
result = calculate_score(test$y, 1- pred_rf_sampling_rose[, "b"])
## [1] "AUC is:  0.8333"
## [1] "BER is:  0.7504"
## [1] "OverAll is:  0.7919"
scores = rbind(scores, data.table(data = "train", model = "rf_rose", AUC = result[1], BER = result[2], OverAll = result[3]))

rf_sampling_rose2 <- train(y ~ .,
                        data = train2,
                        method = "rf",
                        tuneGrid =  expand.grid(mtry = seq(floor(ncol(train) / 4), floor(ncol(train) / 2), length.out = 20)),
                        ntree = 100,
                        nodesize = 5,
                        metric = "ROC",
                        verbose = FALSE,
                        trControl = ctrl)

pred_rf_sampling_rose2 <- predict(rf_sampling_rose2, newdata = test, type = "prob")
result = calculate_score(test$y, 1- pred_rf_sampling_rose2[, "b"])
## [1] "AUC is:  0.846"
## [1] "BER is:  0.314"
## [1] "OverAll is:  0.58"
scores = rbind(scores, data.table(data = "train2", model = "rf_rose", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "smote"

rf_sampling_smote <- train(y ~ .,
                           data = train,
                           method = "rf",
                           tuneGrid =  expand.grid(mtry = seq(floor(ncol(train) / 4), floor(ncol(train) / 2), length.out = 20)),
                           ntree = 100,
                           nodesize = 5,
                           metric = "ROC",
                           verbose = FALSE,
                           trControl = ctrl)

pred_rf_sampling_smote <- predict(rf_sampling_smote, newdata = test, type = "prob")
result = calculate_score(test$y, pred_rf_sampling_smote[, "b"])
## [1] "AUC is:  0.8395"
## [1] "BER is:  0.8164"
## [1] "OverAll is:  0.8279"
scores = rbind(scores, data.table(data = "train", model = "rf_smote", AUC = result[1], BER = result[2], OverAll = result[3]))

rf_sampling_smote2 <- train(y ~ .,
                           data = train2,
                           method = "rf",
                           tuneGrid =  expand.grid(mtry = seq(floor(ncol(train) / 4), floor(ncol(train) / 2), length.out = 20)),
                           ntree = 100,
                           nodesize = 5,
                           metric = "ROC",
                           verbose = FALSE,
                           trControl = ctrl)

pred_rf_sampling_smote2 <- predict(rf_sampling_smote2, newdata = test, type = "prob")
result = calculate_score(test$y, pred_rf_sampling_smote2[, "b"])
## [1] "AUC is:  0.8405"
## [1] "BER is:  0.8116"
## [1] "OverAll is:  0.826"
scores = rbind(scores, data.table(data = "train2", model = "rf_smote", AUC = result[1], BER = result[2], OverAll = result[3]))

Now, we can implement the SGB models for these data. The most important parameters in a Stochastic Gradient Boosting model are depth of the tree, learning rate and number of trees. We will try to find best value for that parameter. There is also one more import parameter, which is the minimal number of observations per tree leaf. We will set 10 for this value. For parameter tuning, we will use sgb_tuning function.

sgb_tuning = function(data){
  data_used = data %>%
    mutate(y = ifelse(y == "a", 1, 0))

  grid = expand.grid(shrinkage = seq(.01, .1, by = 0.005), interaction.depth = c(3, 5, 7, 9), n.trees = c(80, 100, 120, 150), n.minobsinnode = 10)
  
  for(i in 1:nrow(grid)) {
    set.seed(12345)
    
    gbm_model <- gbm(y ~ .,
                     data = data_used,
                     distribution = "bernoulli",
                     interaction.depth = grid$interaction.depth[i],
                     n.trees = grid$n.trees[i],
                     shrinkage = grid$shrinkage[i],
                     n.minobsinnode = grid$n.minobsinnode[i],
                     train.fraction = .75,
                     n.cores = NULL,
                     verbose = FALSE
    )
    grid$min_RMSE[i] <- sqrt(min(gbm_model$valid.error))
  }
  best_param = grid[which.min(grid$min_RMSE),]
  
  model <- gbm(y ~ .,
                   data = data_used,
                   distribution = "bernoulli",
                   interaction.depth = best_param$interaction.depth,
                   n.trees = best_param$n.trees,
                   shrinkage = best_param$shrinkage,
                   n.minobsinnode = best_param$n.minobsinnode,
                   train.fraction = .75,
                   n.cores = NULL,
                   verbose = FALSE
  ) 
  model
}
sgb_train = sgb_tuning(train)
sgb_train2 = sgb_tuning(train2)
sgb_train_over = sgb_tuning(train_over)
sgb_train_under = sgb_tuning(train_under)
sgb_train_both = sgb_tuning(train_both)
sgb_train_rose = sgb_tuning(train_rose)
sgb_train_varimp = sgb_tuning(train_varimp)
sgb_train_over_varimp = sgb_tuning(train_over_varimp)
sgb_train_under_varimp = sgb_tuning(train_under_varimp)
sgb_train_both_varimp = sgb_tuning(train_both_varimp)
sgb_train_rose_varimp = sgb_tuning(train_rose_varimp)
sgb_train_rfe = sgb_tuning(train_rfe)
sgb_train_over_rfe = sgb_tuning(train_over_rfe)
sgb_train_under_rfe = sgb_tuning(train_under_rfe)
sgb_train_both_rfe = sgb_tuning(train_both_rfe)
sgb_train_rose_rfe = sgb_tuning(train_rose_rfe)

After creating these models, we need to calculate score for each of them on the test set.

result = calculate_score_sgb(sgb_train, test)
## [1] "AUC is:  0.8447"
## [1] "BER is:  0.8229"
## [1] "OverAll is:  0.8338"
scores = rbind(scores, data.table(data = "train", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train2, test)
## [1] "AUC is:  0.8447"
## [1] "BER is:  0.8229"
## [1] "OverAll is:  0.8338"
scores = rbind(scores, data.table(data = "train2", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_over, test)
## [1] "AUC is:  0.838"
## [1] "BER is:  0.81"
## [1] "OverAll is:  0.824"
scores = rbind(scores, data.table(data = "train_over", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_under, test)
## [1] "AUC is:  0.8468"
## [1] "BER is:  0.81"
## [1] "OverAll is:  0.8284"
scores = rbind(scores, data.table(data = "train_under", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_both, test)
## [1] "AUC is:  0.8477"
## [1] "BER is:  0.8229"
## [1] "OverAll is:  0.8353"
scores = rbind(scores, data.table(data = "train_both", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_rose, test)
## [1] "AUC is:  0.8133"
## [1] "BER is:  0.7826"
## [1] "OverAll is:  0.798"
scores = rbind(scores, data.table(data = "train_rose", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_varimp, test)
## [1] "AUC is:  0.8486"
## [1] "BER is:  0.8261"
## [1] "OverAll is:  0.8373"
scores = rbind(scores, data.table(data = "train_varimp", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_over_varimp, test)
## [1] "AUC is:  0.8364"
## [1] "BER is:  0.8132"
## [1] "OverAll is:  0.8248"
scores = rbind(scores, data.table(data = "train_over_varimp", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_under_varimp, test)
## [1] "AUC is:  0.8493"
## [1] "BER is:  0.8068"
## [1] "OverAll is:  0.8281"
scores = rbind(scores, data.table(data = "train_under_varimp", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_both_varimp, test)
## [1] "AUC is:  0.8456"
## [1] "BER is:  0.8148"
## [1] "OverAll is:  0.8302"
scores = rbind(scores, data.table(data = "train_both_varimp", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_rose_varimp, test)
## [1] "AUC is:  0.833"
## [1] "BER is:  0.7826"
## [1] "OverAll is:  0.8078"
scores = rbind(scores, data.table(data = "train_rose_varimp", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_rfe, test)
## [1] "AUC is:  0.8544"
## [1] "BER is:  0.8277"
## [1] "OverAll is:  0.841"
scores = rbind(scores, data.table(data = "train_rfe", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_over_rfe, test)
## [1] "AUC is:  0.834"
## [1] "BER is:  0.8164"
## [1] "OverAll is:  0.8252"
scores = rbind(scores, data.table(data = "train_over_rfe", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_under_rfe, test)
## [1] "AUC is:  0.8468"
## [1] "BER is:  0.81"
## [1] "OverAll is:  0.8284"
scores = rbind(scores, data.table(data = "train_under_rfe", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_both_rfe, test)
## [1] "AUC is:  0.8052"
## [1] "BER is:  0.7729"
## [1] "OverAll is:  0.7891"
scores = rbind(scores, data.table(data = "train_both_rfe", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))
result = calculate_score_sgb(sgb_train_rose_rfe, test)
## [1] "AUC is:  0.8148"
## [1] "BER is:  0.7826"
## [1] "OverAll is:  0.7987"
scores = rbind(scores, data.table(data = "train_rose_rfe", model = "sgb", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl <- trainControl(method = "cv",
                     number = 10,
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE)

grid = expand.grid(shrinkage = seq(.01, .1, by = 0.005), interaction.depth = c(3, 5, 7, 9), n.trees = c(80, 100, 120, 150), n.minobsinnode = 10)

train_sgb = train %>%
  mutate(y = as.factor(y))
train_sgb2 = train2 %>%
  mutate(y = as.factor(y))
test_sgb = test %>%
  mutate(y = as.factor(y))

sgb_weighted <- train(y ~ .,
                      data = train_sgb,
                      method = "gbm",
                      weights = model_weights,
                      distribution = "bernoulli",
                      tuneGrid =  grid,
                      verbose = FALSE,
                      trControl = ctrl)

pred_weighted_sgb <- predict(sgb_weighted, newdata = test_sgb, type = "prob")
result = calculate_score(test_sgb$y, pred_weighted_sgb[, "b"])
## [1] "AUC is:  0.8572"
## [1] "BER is:  0.7633"
## [1] "OverAll is:  0.8102"
scores = rbind(scores, data.table(data = "train", model = "sgb_weighted", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "down"

sgb_sampling_down <- train(y ~ .,
                           data = train_sgb,
                           method = "gbm",
                           distribution = "bernoulli",
                           tuneGrid =  grid,
                           verbose = FALSE,
                           trControl = ctrl)

pred_sgb_sampling_down <- predict(sgb_sampling_down, newdata = test_sgb, type = "prob")
result = calculate_score(test$y, pred_sgb_sampling_down[, "b"])
## [1] "AUC is:  0.8561"
## [1] "BER is:  0.7456"
## [1] "OverAll is:  0.8008"
scores = rbind(scores, data.table(data = "train", model = "sgb_down", AUC = result[1], BER = result[2], OverAll = result[3]))

sgb_sampling_down2 <- train(y ~ .,
                           data = train_sgb2,
                           method = "gbm",
                           distribution = "bernoulli",
                           tuneGrid =  grid,
                           verbose = FALSE,
                           trControl = ctrl)

pred_sgb_sampling_down2 <- predict(sgb_sampling_down2, newdata = test_sgb, type = "prob")
result = calculate_score(test$y, pred_sgb_sampling_down2[, "b"])
## [1] "AUC is:  0.8547"
## [1] "BER is:  0.7472"
## [1] "OverAll is:  0.8009"
scores = rbind(scores, data.table(data = "train2", model = "sgb_down", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "up"

sgb_sampling_up <- train(y ~ .,
                         data = train_sgb,
                         method = "gbm",
                         distribution = "bernoulli",
                         tuneGrid =  grid,
                         verbose = FALSE,
                         trControl = ctrl)

pred_sgb_sampling_up <- predict(sgb_sampling_up, newdata = test_sgb, type = "prob")
result = calculate_score(test$y, pred_sgb_sampling_up[, "b"])
## [1] "AUC is:  0.8585"
## [1] "BER is:  0.752"
## [1] "OverAll is:  0.8053"
scores = rbind(scores, data.table(data = "train", model = "sgb_up", AUC = result[1], BER = result[2], OverAll = result[3]))

sgb_sampling_up2 <- train(y ~ .,
                         data = train_sgb2,
                         method = "gbm",
                         distribution = "bernoulli",
                         tuneGrid =  grid,
                         verbose = FALSE,
                         trControl = ctrl)

pred_sgb_sampling_up2 <- predict(sgb_sampling_up2, newdata = test_sgb, type = "prob")
result = calculate_score(test$y, pred_sgb_sampling_up2[, "b"])
## [1] "AUC is:  0.8521"
## [1] "BER is:  0.7729"
## [1] "OverAll is:  0.8125"
scores = rbind(scores, data.table(data = "train2", model = "sgb_up", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "rose"

sgb_sampling_rose <- train(y ~ .,
                           data = train_sgb,
                           method = "gbm",
                           distribution = "bernoulli",
                           tuneGrid =  grid,
                           verbose = FALSE,
                           trControl = ctrl)

pred_sgb_sampling_rose <- predict(sgb_sampling_rose, newdata = test_sgb, type = "prob")
result = calculate_score(test$y, pred_sgb_sampling_rose[, "b"])
## [1] "AUC is:  0.5825"
## [1] "BER is:  0.2512"
## [1] "OverAll is:  0.4168"
scores = rbind(scores, data.table(data = "train", model = "sgb_rose", AUC = result[1], BER = result[2], OverAll = result[3]))

sgb_sampling_rose2 <- train(y ~ .,
                           data = train_sgb2,
                           method = "gbm",
                           distribution = "bernoulli",
                           tuneGrid =  grid,
                           verbose = FALSE,
                           trControl = ctrl)

pred_sgb_sampling_rose2 <- predict(sgb_sampling_rose2, newdata = test_sgb, type = "prob")
result = calculate_score(test$y, pred_sgb_sampling_rose2[, "b"])
## [1] "AUC is:  0.8539"
## [1] "BER is:  0.686"
## [1] "OverAll is:  0.77"
scores = rbind(scores, data.table(data = "train2", model = "sgb_rose", AUC = result[1], BER = result[2], OverAll = result[3]))

ctrl$sampling = "smote"

sgb_sampling_smote <- train(y ~ .,
                            data = train_sgb,
                            method = "gbm",
                            distribution = "bernoulli",
                            tuneGrid =  grid,
                            verbose = FALSE,
                            trControl = ctrl)

pred_sgb_sampling_smote <- predict(sgb_sampling_smote, newdata = test_sgb, type = "prob")
result = calculate_score(test$y, pred_sgb_sampling_smote[, "b"])
## [1] "AUC is:  0.8468"
## [1] "BER is:  0.8132"
## [1] "OverAll is:  0.83"
scores = rbind(scores, data.table(data = "train", model = "sgb_smote", AUC = result[1], BER = result[2], OverAll = result[3]))

sgb_sampling_smote2 <- train(y ~ .,
                            data = train_sgb2,
                            method = "gbm",
                            distribution = "bernoulli",
                            tuneGrid =  grid,
                            verbose = FALSE,
                            trControl = ctrl)

pred_sgb_sampling_smote2 <- predict(sgb_sampling_smote2, newdata = test_sgb, type = "prob")
result = calculate_score(test$y, pred_sgb_sampling_smote2[, "b"])
## [1] "AUC is:  0.852"
## [1] "BER is:  0.81"
## [1] "OverAll is:  0.831"
scores = rbind(scores, data.table(data = "train2", model = "sgb_smote", AUC = result[1], BER = result[2], OverAll = result[3]))

3. RESULT & CONCLUSION

We split the data as train and test data. From these methods, you can find the best 5 results of these models.

head(scores, 5)
##           data model    AUC    BER OverAll
## 1:       train lasso 0.8458 0.8132  0.8295
## 2:       train ridge 0.8582 0.8213  0.8397
## 3:  train_over lasso 0.8517 0.7601  0.8059
## 4:  train_over ridge 0.8477 0.7424  0.7950
## 5: train_under lasso 0.8375 0.7327  0.7851

When we uploaded the SGB model that we trained on the data that we applied undersampling method and feature selection based on the variance approach, we get 0.8901 accuracy. So, we can say that undersampling and SGB model is a good method for this model. As a feature selection method, looking for the variance of the data is a more reliable approach than the other methods.